s5_220301_human_tnbc_integration_rliger

nlc

2/8/2022

Set up

Set seed

set.seed(5)

Load libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(Seurat)
## Attaching SeuratObject
library(ggalluvial)

Load wu et all data, normalize and run PCA/UMAP

if(file.exists('analysis_files/s5_so_wu.rds')){
  so_wu <- readRDS('analysis_files/s5_so_wu.rds')
}else{
  print('Wu et al Seurat object not found, loading from publication files')
  
  # Read counts
  wu_10x <- Read10X('/Users/Nick/OneDrive - Oregon Health & Science University/public_data/wu_et_al_2021_nat_gen/data/')
  
  # Read metadata & convert to Seurat accepted format
  wu_meta <- read_csv('/Users/Nick/OneDrive - Oregon Health & Science University/public_data/wu_et_al_2021_nat_gen/Whole_miniatlas_meta.csv',
                      lazy = FALSE)[-1,] %>%
    mutate(Percent_mito = as.numeric(Percent_mito)) %>%
    mutate(nCount_RNA = as.numeric(nCount_RNA)) %>%
    mutate(nFeature_RNA = as.numeric(nFeature_RNA))
  
  wu_meta_df <- as.data.frame(wu_meta)
  row.names(wu_meta_df) <- wu_meta_df$NAME
  
  # Read patient metadata and add to wu_meta_df
  wu_patient_meta <- read_csv('/Users/Nick/OneDrive - Oregon Health & Science University/public_data/wu_et_al_2021_nat_gen/supplementary_table1.csv',
                              lazy = FALSE) %>%
    janitor::clean_names() %>%
    mutate(patient_id = paste0('CID', str_remove(case_id, pattern = '-|A|N'))) %>%
    mutate(her2_status = subtype_by_ihc %in% c('HER2+/ER+', 'HER2+')) %>%
    mutate(er_status = subtype_by_ihc %in% c('HER2+/ER+', 'ER+'))
  
  # Add ER and HER2 status back to scRNA-seq meta for easier plotting later
  
  wu_meta_df$her2_status <- plyr::mapvalues(x = str_remove(wu_meta_df$Patient, pattern = 'A|N'),
                                        from = wu_patient_meta$patient_id,
                                        to = wu_patient_meta$her2_status)
  
  wu_meta_df$er_status <- plyr::mapvalues(x = str_remove(wu_meta_df$Patient, pattern = 'A|N'),
                                        from = wu_patient_meta$patient_id,
                                        to = wu_patient_meta$er_status)
  
  wu_meta_df$subtype_by_ihc <- plyr::mapvalues(x = str_remove(wu_meta_df$Patient, pattern = 'A|N'),
                                        from = wu_patient_meta$patient_id,
                                        to = wu_patient_meta$subtype_by_ihc) 
  
  
  # Confirm all cell barcodes have meta data
  sum(colnames(wu_10x) %in% row.names(wu_meta_df))/length(unique(colnames(wu_10x), row.names(wu_meta_df)))
  
  # Create seurat object
  so_wu <- CreateSeuratObject(counts = wu_10x,
                              meta.data = wu_meta_df,
                              project = 'wu_2020_natgen')
  
  wu_10x <- wu_meta <- wu_meta_df <- NULL
  gc()
  
  
  ## Normalize, find variable features and scale
  
  so_wu <- NormalizeData(so_wu)
  so_wu <- FindVariableFeatures(so_wu)
  top_vf <- head(VariableFeatures(so_wu), 50)
  LabelPoints(plot = VariableFeaturePlot(so_wu),
              points = top_vf)
  so_wu <- ScaleData(so_wu)
  
  ## PCA and UMAP
  so_wu <- RunPCA(so_wu,
                npcs = 50)

  ElbowPlot(so_wu,
            ndims = 50)
  
  so_wu <- RunUMAP(so_wu,
                   dims = 1:20)
  
  saveRDS(so_wu, 'analysis_files/s5_so_wu.rds')
}

Wu et all analysis

QC was already performed by original authors

FeatureScatter(so_wu,
               feature1 = 'nCount_RNA',
               feature2 = 'nFeature_RNA')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

FeatureScatter(so_wu,
               feature1 = 'nCount_RNA',
               feature2 = 'Percent_mito')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

range(so_wu@meta.data$nCount_RNA)
## [1]    252 181558
range(so_wu@meta.data$nFeature_RNA)
## [1]   201 10358
range(so_wu@meta.data$Percent_mito)
## [1]  0.0000 19.9967

Visualize UMAP

DimPlot(so_wu,
        raster = TRUE)+
  coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

DimPlot(so_wu,
        raster = TRUE,
        group.by = 'celltype_major',
        label = TRUE)+
  coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

DimPlot(so_wu,
        raster = TRUE,
        group.by = 'normal_cell_call',
        label = TRUE)+
  coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

DimPlot(so_wu,
        raster = TRUE,
        group.by = 'celltype_minor',
        label = TRUE)+
  coord_equal()+
  theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

DimPlot(so_wu,
        raster = TRUE,
        group.by = 'celltype_minor',
        label = FALSE)+
  coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

DimPlot(so_wu,
        raster = TRUE,
        group.by = 'celltype_subset',
        label = TRUE)+
  coord_equal()+
  theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

DimPlot(so_wu,
        raster = TRUE,
        group.by = 'subtype',
        label = TRUE,
        shuffle = TRUE)+
  coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

DimPlot(so_wu,
        raster = TRUE,
        group.by = 'subtype',
        label = TRUE)+
  facet_wrap(~subtype)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

Celltype assignments

Review wu et all celltype labels

so_wu@meta.data %>%
  as.tibble() %>%
  dplyr::select(celltype_major, celltype_minor, celltype_subset) %>%
  distinct() %>%
  DT::datatable()
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
so_wu_labels_freq <- so_wu@meta.data %>%
  dplyr::select(celltype_major, celltype_minor, celltype_subset) %>%
  group_by(celltype_major, celltype_minor, celltype_subset) %>%
  summarize(n_cells = n())
## `summarise()` has grouped output by 'celltype_major', 'celltype_minor'. You can
## override using the `.groups` argument.
ggplot(so_wu_labels_freq, aes(y = n_cells, axis1 = celltype_major, axis2 = celltype_minor, axis3 = celltype_subset, fill = celltype_major))+
    geom_alluvium()+
    geom_stratum()+
    geom_label(stat = "stratum", aes(label = after_stat(stratum)))+
    theme_minimal()+
    scale_x_discrete(limits = c('celltype_major', 'celltype_minor', 'celltype_subset'))+
    labs(fill = 'celltype_major')
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.

Recode from wu celltype_major to ‘lineage’ equivalent from Myc;Ptenfl analysis

unique(so_wu@meta.data$celltype_major)
## [1] "Endothelial"       "CAFs"              "PVL"              
## [4] "B-cells"           "T-cells"           "Myeloid"          
## [7] "Normal Epithelial" "Plasmablasts"      "Cancer Epithelial"
celltype_major_l1_dict <- rbind(c('Endothelial', 'endothelial'),
                                c('CAFs', 'fibroblast'),
                                c('PVL', 'perivascular'),
                                c('B-cells', 'lymphoid'),
                                c('T-cells', 'lymphoid'),
                                c('Myeloid', 'myeloid'),
                                c('Normal Epithelial', 'epithelial'),
                                c('Plasmablasts', 'lymphoid'),
                                c('Cancer Epithelial', 'epithelial'))

DT::datatable(celltype_major_l1_dict)
so_wu@meta.data$celltype_l1 <- plyr::mapvalues(x = so_wu@meta.data$celltype_major,
                                         from = celltype_major_l1_dict[,1],
                                         to = celltype_major_l1_dict[,2])

DimPlot(so_wu,
        group.by = 'celltype_major',
        label = TRUE)+
  coord_equal()+
  theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

DimPlot(so_wu,
        group.by = 'celltype_l1',
        label = TRUE)+
  theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

Find orthologous features

Load Myc;Ptenfl data

so_merge <- readRDS('analysis_files/s4_so_merge.rds')

Idents(so_merge) <- 'cluster_l2'

Find orthologous (shared) features across the two data sets

library(nichenetr)
mycpten_features <- tibble(orig.features = rownames(so_merge)) %>%
  mutate(converted.features = convert_mouse_to_human_symbols(orig.features)) %>%
  mutate(new.features = converted.features)

mycpten_features$new.features[is.na(mycpten_features$new.features)] <- mycpten_features$orig.features[is.na(mycpten_features$new.features)]

ortho_features <- mycpten_features$converted.features[! is.na(mycpten_features$converted.features)]

scPred: Wu celltype_minor -> mycpten data

Subset Wu data to only use shared genes

library(scPred)

if(file.exists('analysis_files/s5_so_wu_shared_trained.rds')){
  print('Loading pre-trained scPred model/Seurat object: s5_so_wu_shared_trained.rds file')
  
  so_wu_shared <- readRDS('analysis_files/s5_so_wu_shared_trained.rds')
  
}else{
  print('Could not find pre-trained scPred model, subseting to shared ortholog features and preparing for model training:')
  
  
  
  # Subset to only shared features with mouse
  so_wu_shared <- subset(so_wu,
                         features = ortho_features)
  
  #subset = celltype_minor %in% names(celltype_minor_table)[celltype_minor_table > 100]
  
  dim(so_wu)
  dim(so_wu_shared)
  
  # Find variable features, run PCA and UMAP then visualize
  so_wu_shared <- FindVariableFeatures(so_wu_shared)
  so_wu_shared <- ScaleData(so_wu_shared,
                            features = VariableFeatures(so_wu_shared))
  so_wu_shared <- RunPCA(so_wu_shared)
  ElbowPlot(so_wu_shared)
  so_wu_shared <- RunUMAP(so_wu_shared,
                          dims = 1:10)
}
## [1] "Loading pre-trained scPred model/Seurat object: s5_so_wu_shared_trained.rds file"
Idents(so_wu_shared) <- 'celltype_minor'

DimPlot(so_wu_shared,
        group.by = 'celltype_minor',
        label = TRUE,
        repel = TRUE)+
  coord_equal()+
  theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

Build predictive model

if(file.exists('analysis_files/s5_so_wu_shared_trained.rds')){
  
  print('Classifier already trained, not re-computing')
  
}else{
  
  print('Training model')
  so_wu_shared <- getFeatureSpace(so_wu_shared,
                         'celltype_minor')

  so_wu_shared <- trainModel(so_wu_shared,
                             model = 'mda')
  
  print('Saving trained model')
  
  saveRDS(so_wu_shared, file = 'analysis_files/s5_so_wu_shared_trained.rds')

  # saveRDS(so_wu_shared@misc$scPred, file = 'analysis_files/s5_scpred_wu_shared.rds')
  
}
## [1] "Classifier already trained, not re-computing"

Look at results

so_wu_shared@misc$scPred
## 'scPred' object
## ✔  Prediction variable = celltype_minor 
## ✔  Discriminant features per cell type
## ✔  Training model(s)
## Summary
## 
## |Cell type                   |     n| Features|Method |   ROC|  Sens|  Spec|
## |:---------------------------|-----:|--------:|:------|-----:|-----:|-----:|
## |B cells Memory              |  2581|       50|mda    | 0.990| 0.946| 0.991|
## |B cells Naive               |   625|       50|mda    | 0.993| 0.958| 0.981|
## |CAFs MSC iCAF-like          |  3153|       50|mda    | 0.987| 0.903| 0.986|
## |CAFs myCAF-like             |  3420|       50|mda    | 0.996| 0.858| 0.993|
## |Cancer Basal SC             |  4312|       50|mda    | 0.967| 0.774| 0.979|
## |Cancer Cycling              |  5359|       50|mda    | 0.969| 0.774| 0.972|
## |Cancer Her2 SC              |  3708|       50|mda    | 0.972| 0.763| 0.976|
## |Cancer LumA SC              |  7742|       50|mda    | 0.988| 0.973| 0.967|
## |Cancer LumB SC              |  3368|       50|mda    | 0.973| 0.902| 0.957|
## |Cycling PVL                 |    50|       50|mda    | 0.988| 0.760| 0.984|
## |Cycling T-cells             |  1528|       50|mda    | 0.982| 0.781| 0.992|
## |Cycling_Myeloid             |   463|       50|mda    | 0.980| 0.758| 0.978|
## |DCs                         |   955|       50|mda    | 0.997| 0.924| 0.991|
## |Endothelial ACKR1           |  4611|       50|mda    | 0.999| 0.973| 0.993|
## |Endothelial CXCL12          |  1644|       50|mda    | 0.998| 0.941| 0.993|
## |Endothelial Lymphatic LYVE1 |   203|       50|mda    | 0.979| 0.808| 0.980|
## |Endothelial RGS5            |  1147|       50|mda    | 0.994| 0.939| 0.983|
## |Luminal Progenitors         |  1992|       50|mda    | 0.999| 0.985| 0.998|
## |Macrophage                  |  5929|       50|mda    | 0.994| 0.917| 0.988|
## |Mature Luminal              |  1265|       50|mda    | 0.989| 0.795| 0.986|
## |Monocyte                    |  2328|       50|mda    | 0.992| 0.885| 0.983|
## |Myoepithelial               |  1098|       50|mda    | 1.000| 0.982| 0.999|
## |NK cells                    |  1846|       50|mda    | 0.994| 0.904| 0.986|
## |NKT cells                   |  1122|       50|mda    | 0.993| 0.855| 0.986|
## |Plasmablasts                |  3524|       50|mda    | 0.998| 0.980| 0.998|
## |PVL Differentiated          |  3487|       50|mda    | 0.998| 0.926| 0.995|
## |PVL Immature                |  1886|       50|mda    | 0.996| 0.905| 0.990|
## |T cells CD4+                | 19231|       50|mda    | 0.988| 0.941| 0.970|
## |T cells CD8+                | 11487|       50|mda    | 0.979| 0.859| 0.968|
get_probabilities(so_wu_shared) %>%
  head()
##                          Endothelial ACKR1 Endothelial RGS5 Endothelial CXCL12
## CID3586_AAGACCTCAGCATGAG                 1     2.003150e-07       4.839682e-22
## CID3586_AAGGTTCGTAGTACCT                 1     7.397193e-05       2.601051e-24
## CID3586_ACCAGTAGTTGTGGCC                 1     7.352694e-09       4.002143e-23
## CID3586_ACCCACTAGATGTCGG                 1     7.303812e-01       2.831623e-23
## CID3586_ACTGATGGTCAACTGT                 1     2.536747e-02       2.212364e-15
## CID3586_ACTTGTTAGGGAAACA                 1     8.911516e-06       3.305400e-30
##                          CAFs MSC iCAF-like CAFs myCAF-like PVL Differentiated
## CID3586_AAGACCTCAGCATGAG       1.868347e-21    4.099174e-19       4.077561e-38
## CID3586_AAGGTTCGTAGTACCT       8.341724e-09    1.160443e-19       9.414944e-18
## CID3586_ACCAGTAGTTGTGGCC       1.248101e-22    4.813984e-16       9.975417e-21
## CID3586_ACCCACTAGATGTCGG       4.469753e-09    2.785414e-21       1.015892e-24
## CID3586_ACTGATGGTCAACTGT       8.192184e-11    1.000265e-24       1.912015e-33
## CID3586_ACTTGTTAGGGAAACA       3.197197e-10    6.744133e-22       2.050644e-25
##                          PVL Immature Endothelial Lymphatic LYVE1
## CID3586_AAGACCTCAGCATGAG 1.515444e-29                0.3568192593
## CID3586_AAGGTTCGTAGTACCT 1.845669e-26                0.2676977985
## CID3586_ACCAGTAGTTGTGGCC 8.502484e-29                0.0005419999
## CID3586_ACCCACTAGATGTCGG 1.253373e-22                0.0001489643
## CID3586_ACTGATGGTCAACTGT 3.105859e-25                0.0005213064
## CID3586_ACTTGTTAGGGAAACA 7.892852e-24                0.7960905689
##                          B cells Memory B cells Naive T cells CD8+ T cells CD4+
## CID3586_AAGACCTCAGCATGAG   1.183670e-07  1.122082e-44 2.144809e-10 7.955333e-06
## CID3586_AAGGTTCGTAGTACCT   5.889650e-05  3.199620e-11 1.705763e-05 3.448587e-06
## CID3586_ACCAGTAGTTGTGGCC   5.350744e-04  1.904061e-11 2.082512e-04 1.032236e-12
## CID3586_ACCCACTAGATGTCGG   4.302226e-08  5.108091e-11 7.562800e-12 2.637520e-05
## CID3586_ACTGATGGTCAACTGT   1.788890e-05  1.343078e-09 8.677196e-09 8.326259e-06
## CID3586_ACTTGTTAGGGAAACA   8.160168e-24  1.225567e-11 4.658081e-05 5.261893e-07
##                              NK cells Cycling T-cells    NKT cells   Macrophage
## CID3586_AAGACCTCAGCATGAG 4.160530e-10    1.076226e-10 5.205474e-08 2.152898e-17
## CID3586_AAGGTTCGTAGTACCT 2.852535e-10    6.803017e-08 9.583610e-16 2.592094e-13
## CID3586_ACCAGTAGTTGTGGCC 4.354209e-08    2.053181e-07 3.561144e-08 2.223510e-17
## CID3586_ACCCACTAGATGTCGG 2.761888e-13    8.245699e-12 2.874633e-14 1.859639e-16
## CID3586_ACTGATGGTCAACTGT 8.206602e-15    3.081613e-11 1.756351e-08 7.244806e-14
## CID3586_ACTTGTTAGGGAAACA 2.259881e-10    5.484310e-16 9.693075e-08 2.263377e-19
##                              Monocyte Cycling_Myeloid          DCs
## CID3586_AAGACCTCAGCATGAG 2.097473e-16    3.209078e-21 1.714277e-17
## CID3586_AAGGTTCGTAGTACCT 2.781148e-13    1.889234e-07 1.524376e-15
## CID3586_ACCAGTAGTTGTGGCC 6.713838e-11    1.237469e-09 6.374311e-13
## CID3586_ACCCACTAGATGTCGG 1.487502e-11    3.368482e-08 1.949465e-12
## CID3586_ACTGATGGTCAACTGT 2.172737e-19    7.703806e-16 9.727881e-23
## CID3586_ACTTGTTAGGGAAACA 1.613937e-15    4.253898e-12 1.013405e-13
##                          Myoepithelial Luminal Progenitors Mature Luminal
## CID3586_AAGACCTCAGCATGAG  2.261935e-55        1.844934e-27   2.347579e-05
## CID3586_AAGGTTCGTAGTACCT  4.061256e-50        2.609801e-25   4.162487e-05
## CID3586_ACCAGTAGTTGTGGCC  1.560566e-55        1.153612e-28   4.298928e-05
## CID3586_ACCCACTAGATGTCGG  3.032393e-58        1.912968e-22   1.790728e-04
## CID3586_ACTGATGGTCAACTGT  1.873096e-50        5.904651e-31   5.638292e-04
## CID3586_ACTTGTTAGGGAAACA  3.115335e-51        1.174433e-28   1.477751e-04
##                          Plasmablasts Cancer Cycling Cancer Her2 SC
## CID3586_AAGACCTCAGCATGAG 7.399475e-21   4.937141e-06   2.302996e-06
## CID3586_AAGGTTCGTAGTACCT 1.245102e-17   1.497658e-05   6.648358e-07
## CID3586_ACCAGTAGTTGTGGCC 2.661955e-19   6.936914e-06   3.690251e-06
## CID3586_ACCCACTAGATGTCGG 1.802947e-13   3.594268e-06   5.971771e-06
## CID3586_ACTGATGGTCAACTGT 1.398909e-18   8.908713e-07   1.936374e-05
## CID3586_ACTTGTTAGGGAAACA 3.603510e-15   1.516829e-05   1.128390e-07
##                          Cancer LumB SC Cancer Basal SC  Cycling PVL
## CID3586_AAGACCTCAGCATGAG   9.944962e-10    8.567279e-06 5.688735e-13
## CID3586_AAGGTTCGTAGTACCT   4.367787e-08    3.867812e-05 7.328937e-14
## CID3586_ACCAGTAGTTGTGGCC   1.761924e-05    4.478141e-06 1.395569e-27
## CID3586_ACCCACTAGATGTCGG   5.558163e-05    4.676754e-06 3.908383e-14
## CID3586_ACTGATGGTCAACTGT   4.344627e-14    1.446185e-06 4.150082e-13
## CID3586_ACTTGTTAGGGAAACA   4.477743e-15    7.689223e-05 1.803005e-26
##                          Cancer LumA SC
## CID3586_AAGACCTCAGCATGAG   3.777797e-06
## CID3586_AAGGTTCGTAGTACCT   9.585564e-07
## CID3586_ACCAGTAGTTGTGGCC   1.944880e-06
## CID3586_ACCCACTAGATGTCGG   2.094946e-06
## CID3586_ACTGATGGTCAACTGT   7.478631e-09
## CID3586_ACTTGTTAGGGAAACA   7.570103e-06
plot_probabilities(so_wu_shared)

Convert so_merge to subset of ortholog features

if(file.exists('analysis_files/s5_so_merge_subset_mda_scpred.rds')){
  print('s5_so_merge_subset_mda_scpred.rds already exists, not recomputing')
  
}else{
  print('Applying classifier to myc;ptenfl data')
  
  so_merge_subset <- subset(so_merge,
                          features = mycpten_features$orig.features[!is.na(mycpten_features$converted.features)])

  # Rename the features
  rownames(so_merge_subset@assays$RNA@counts) <- plyr::mapvalues(x = rownames(so_merge_subset@assays$RNA@counts),
                                                                 from = mycpten_features$orig.features,
                                                                 to = mycpten_features$converted.features,
                                                                 warn_missing = FALSE)
  
  rownames(so_merge_subset@assays$RNA@data) <- plyr::mapvalues(x = rownames(so_merge_subset@assays$RNA@data),
                                                                 from = mycpten_features$orig.features,
                                                                 to = mycpten_features$converted.features,
                                                                 warn_missing = FALSE)
  
  
  # Find new variable features and rescale
  
  so_merge_subset <- scPredict(so_merge_subset, so_wu_shared)
  
  # Visualize assigned label
  DimPlot(so_merge_subset,
          group.by = 'scpred_prediction')+
    coord_equal()
  
  # Celltype_full vs assigned label
  
  freq <- so_merge_subset@meta.data %>%
    dplyr::select(celltype_full, scpred_prediction) %>%
    table()
  
   nmi <- aricode::NMI(c1 = so_merge_subset@meta.data$celltype_full,
                     c2 = so_merge_subset@meta.data$scpred_prediction)
   
   ari <- aricode::ARI(c1 = so_merge_subset@meta.data$celltype_full,
                     c2 = so_merge_subset@meta.data$scpred_prediction)
  
  pheatmap::pheatmap(prop.table(freq, margin = 1),
                     display_numbers = freq,
                     main = paste0('NMI: ', nmi, '\n ARI: ', ari))
  
  saveRDS(so_merge_subset, file = 'analysis_files/s5_so_merge_subset_mda_scpred.rds')
}
## [1] "s5_so_merge_subset_mda_scpred.rds already exists, not recomputing"

Split by celltype_l1

# heatmap split by lineage

for(i in c('epithelial', 'fibroblast', 'myeloid', 'lymphoid')){
  curr_subset <- subset(so_merge_subset, subset = lineage == i)
  
  curr_freq <- curr_subset@meta.data %>%
    dplyr::select(celltype_full, scpred_prediction) %>%
    table()
  
 labels <- curr_subset@meta.data %>%
    dplyr::select(celltype_full, scpred_prediction)
  
  # filter empty rows/columns
  
  curr_freq <- curr_freq[rowSums(curr_freq) != 0, colSums(curr_freq) != 0]

  p1 <- pheatmap::pheatmap(prop.table(curr_freq, margin = 1),
                           display_numbers = round(prop.table(curr_freq, margin = 1), 2),
                           main = i)
  
  print(p1)
}

Cross-species integration with UINMF via RLiger

convert from mouse gene to human gene via known orthologs.

library(rliger)
## Loading required package: cowplot
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: patchwork
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
## 
##     align_plots
library(SeuratWrappers)

if(file.exists('analysis_files/s5_xspecies_uinmf_so.rds')){
  print('Loading saved UINMF integrated .rds')
  
  species.liger <- readRDS('analysis_files/s5_xspecies_uinmf_rliger.rds')
  so_rliger <- readRDS('analysis_files/s5_xspecies_uinmf_so.rds')
  
}else{
  print('Did not find saved Seurat integrated data, processing new one')
  
  #Remove files from classifier
  rm(so_wu_shared)
  rm(so_merge_subset)
  
  # Pull metadata
  wu_meta <- so_wu@meta.data
  mycpten_meta <- so_merge@meta.data
  
  # pull counts
  wu_counts <- so_wu@assays$RNA@counts
  mycpten_counts <- so_merge@assays$RNA@counts
  
  # Remove unneeded seurat objects to make room for rliger object
  rm(so_wu)
  rm(so_merge)
  gc()
  
  # Convert row names to human ortholog when possible
  
  rownames(mycpten_counts) <- mycpten_features$new.features
  
  tibble(n_mouse_features = nrow(mycpten_counts),
         n_human_features = nrow(wu_counts),
         n_shared_features = length(intersect(rownames(wu_counts), rownames(mycpten_counts)))) %>%
    knitr::kable()
  
  # Add ortholog filtered count
  
  # Combine into cross-species liger object
  species.liger <- createLiger(list(mouse = mycpten_counts,
                                    human = wu_counts),
                               take.gene.union = FALSE)

}
## [1] "Loading saved UINMF integrated .rds"

Normalize, select genes, scale and perform joint matrix factorization

if(!file.exists('analysis_files/s5_xspecies_uinmf_so.rds')){
  species.liger <- normalize(species.liger)

  species.liger <- selectGenes(species.liger,
                               var.thres= 0.3,
                               unshared = TRUE,
                               unshared.datasets = list(1,2),
                               unshared.thresh = 0.3)
  
  species.liger <- scaleNotCenter(species.liger)  

  species.liger <- optimizeALS(species.liger,
                             lambda = 5,
                             use.unshared = TRUE,
                             thresh = 1e-10,
                             k = 50,
                             nrep = 5)
  
  species.liger <- quantile_norm(species.liger,
                                 ref_dataset = "human")
}

Quantile Norm, clustering and process UMAP

if(!file.exists('analysis_files/s5_xspecies_uinmf_so.rds')){
  
  # delete below
    species.liger <- optimizeALS(species.liger,
                             lambda = 5,
                             use.unshared = TRUE,
                             thresh = 1e-10,
                             k = 50,
                             nrep = 5)
  
  species.liger <- quantile_norm(species.liger,
                                 ref_dataset = "human")
  
  # Delete above
  
  species.liger <- louvainCluster(species.liger)
  
  species.liger <- runUMAP(species.liger)
  
  saveRDS(species.liger, 'analysis_files/s5_xspecies_uinmf_rliger.rds')
  
  # Convert to seurat object and save
  
  # Convert to seurat object and run umap + find neighbors + unoptimized clustering
  so_rliger <- ligerToSeurat(species.liger)
  so_rliger <- RunUMAP(so_rliger, reduction = 'inmf', dims = 1:50)
  so_rliger <- FindNeighbors(so_rliger, 
                             reduction = 'inmf',
                             dims = 1:50)
  
  so_rliger <- FindClusters(so_rliger,
                            resolution = 0.9,
                            algorithm = 1) # Using optimized resolution from code below
  
  # Add metadata back from two original seurat objects
  
  wu_meta$barcode <- paste0('human_', rownames(wu_meta))
  mycpten_meta$barcode <- paste0('mouse_', rownames(mycpten_meta))
  
  so_rliger_meta <- so_rliger@meta.data
  so_rliger_meta$barcode <- rownames(so_rliger_meta)
  so_rliger_meta <- full_join(x = so_rliger_meta,
                              y = wu_meta,
                              by = 'barcode',
                              suffix = c('', ''))
  
  so_rliger_meta <- left_join(x = so_rliger_meta,
                              y = mycpten_meta,
                              by = 'barcode',
                              suffix = c('', ''))
  
  so_rliger_meta$species <- str_split(so_rliger_meta$barcode, pattern = '_', simplify = TRUE)[,1]
  
  
  so_rliger@meta.data <- so_rliger_meta
  rownames(so_rliger@meta.data) <- so_rliger_meta$barcode
  
}

Optimize silhouette width on integrated seurat object NOTE: can’t run leiden algorithm, reverting to louvain (leiden requires conversion to dense matrix format)

Prepare silhoutte scoring function

library(bluster)

cluster_sweep <- function(resolution = seq(from = 0.3, to = 0.8, by = 0.1), seurat_object, embedding, ndims = ncol(emedding)){
  
  
  for(i in resolution){
    seurat_object <- FindClusters(seurat_object,
                                resolution = i,
                                algorithm = 1)
  
    p1 <- DimPlot(seurat_object,
                  group.by = 'seurat_clusters',
                  label = TRUE)+
      coord_equal()+
      ggtitle(paste0('Resolution: ', resolution))
    
    print(p1)
    
    curr_clusters <- seurat_object@meta.data$seurat_clusters 
    
    sil <- approxSilhouette(x = embedding,
                            clusters = curr_clusters)
    
    boxplot(split(sil$width, curr_clusters),
            main = paste0('Resolution: ', i, '\n Mean sil.width: ', mean(sil$width)))
    
    best.choice <- ifelse(sil$width > 0,
                          curr_clusters,
                          sil$other)
    
    table(Assigned=curr_clusters, Closest=best.choice)
    
    # 
    rmsd <- clusterRMSD(embedding, curr_clusters)
    barplot(rmsd,
            main = paste0('Resolution: ', i, '\n Mean rmsd: ', mean(rmsd)))
    
    
    if(i == resolution[1]){
      qc <- tibble(res = i,
                   n_clusters = length(unique(sil$cluster)),
                   mean_sil_width = mean(sil$width),
                   mean_rmsd = mean(rmsd))
    }else{
      qc <- rbind(qc,
                  tibble(res = i,
                  n_clusters = length(unique(sil$cluster)),
                  mean_sil_width = mean(sil$width),
                  mean_rmsd = mean(rmsd)))
    }
    
  }
  return(qc)
}

Find best silhouette width

cluster_qc <- cluster_sweep(seurat_object = so_rliger,
                            embedding = so_rliger@reductions$inmf@cell.embeddings,
                            resolution = seq(from = 0.3, to = 1.2, by = 0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9660
## Number of communities: 26
## Elapsed time: 64 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9602
## Number of communities: 30
## Elapsed time: 59 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9552
## Number of communities: 31
## Elapsed time: 58 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9507
## Number of communities: 37
## Elapsed time: 59 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9467
## Number of communities: 39
## Elapsed time: 64 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9434
## Number of communities: 42
## Elapsed time: 58 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9400
## Number of communities: 43
## Elapsed time: 67 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9367
## Number of communities: 44
## Elapsed time: 69 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9333
## Number of communities: 45
## Elapsed time: 59 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 114106
## Number of edges: 4243103
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9300
## Number of communities: 47
## Elapsed time: 59 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

cluster_qc_gathered <- cluster_qc %>%
  gather(- c(res), value = 'value', key = 'metric')

ggplot(cluster_qc_gathered, aes(x = res, y = value, color = metric))+
  geom_point()+
  geom_line()+
  theme_bw()+
  facet_wrap(~metric, ncol = 1, scales = 'free_y')

Cluster Seurat object and save

so_rliger <- FindClusters(so_rliger,
                          resolution = 0.6,
                          algorithm = 1)

# Save seurat object as .rds file

saveRDS(so_rliger, 'analysis_files/s5_xspecies_uinmf_so.rds')

sessionInfo()

sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bluster_1.6.0        SeuratWrappers_0.3.0 rliger_1.0.0        
##  [4] patchwork_1.1.2      Matrix_1.5-1         cowplot_1.1.1       
##  [7] scPred_1.9.2         nichenetr_1.1.0      ggalluvial_0.12.3   
## [10] SeuratObject_4.1.3   Seurat_4.3.0         forcats_0.5.2       
## [13] stringr_1.5.0        dplyr_1.0.10         purrr_0.3.4         
## [16] readr_2.1.3          tidyr_1.2.1          tibble_3.1.8        
## [19] ggplot2_3.4.0        tidyverse_1.3.2     
## 
## loaded via a namespace (and not attached):
##   [1] scattermore_0.8        ModelMetrics_1.2.2.2   R.methodsS3_1.8.2     
##   [4] bit64_4.0.5            knitr_1.41             R.utils_2.12.2        
##   [7] irlba_2.3.5.1          data.table_1.14.6      rpart_4.1.19          
##  [10] hardhat_1.2.0          doParallel_1.0.17      generics_0.1.3        
##  [13] BiocGenerics_0.44.0    RANN_2.6.1             proxy_0.4-27          
##  [16] future_1.29.0          DiagrammeR_1.0.9       bit_4.0.4             
##  [19] tzdb_0.3.0             spatstat.data_3.0-0    xml2_1.3.3            
##  [22] lubridate_1.8.0        httpuv_1.6.6           assertthat_0.2.1      
##  [25] gargle_1.2.1           gower_1.0.0            xfun_0.35             
##  [28] hms_1.1.2              jquerylib_0.1.4        evaluate_0.18         
##  [31] promises_1.2.0.1       fansi_1.0.3            caTools_1.18.2        
##  [34] dbplyr_2.2.1           readxl_1.4.1           igraph_1.3.5          
##  [37] DBI_1.1.3              htmlwidgets_1.5.4      spatstat.geom_3.0-3   
##  [40] googledrive_2.0.0      stats4_4.2.2           ellipsis_0.3.2        
##  [43] riverplot_0.10         crosstalk_1.2.0        backports_1.4.1       
##  [46] bookdown_0.30          deldir_1.0-6           vctrs_0.5.1           
##  [49] remotes_2.4.2          ROCR_1.0-11            abind_1.4-5           
##  [52] caret_6.0-93           cachem_1.0.6           withr_2.5.0           
##  [55] progressr_0.11.0       checkmate_2.1.0        sctransform_0.3.5     
##  [58] fdrtool_1.2.17         mclust_6.0.0           goftest_1.2-3         
##  [61] cluster_2.1.4          lazyeval_0.2.2         crayon_1.5.2          
##  [64] hdf5r_1.3.7            spatstat.explore_3.0-5 recipes_1.0.3         
##  [67] pkgconfig_2.0.3        labeling_0.4.2         nlme_3.1-160          
##  [70] vipor_0.4.5            nnet_7.3-18            rlang_1.0.6           
##  [73] globals_0.16.2         lifecycle_1.0.3        miniUI_0.1.1.1        
##  [76] rsvd_1.0.5             modelr_0.1.10          cellranger_1.1.0      
##  [79] randomForest_4.7-1.1   polyclip_1.10-0        matrixStats_0.63.0    
##  [82] lmtest_0.9-40          zoo_1.8-11             reprex_2.0.2          
##  [85] base64enc_0.1-3        beeswarm_0.4.0         ggridges_0.5.4        
##  [88] GlobalOptions_0.1.2    googlesheets4_1.0.1    png_0.1-7             
##  [91] viridisLite_0.4.1      rjson_0.2.21           bitops_1.0-7          
##  [94] R.oo_1.25.0            KernSmooth_2.23-20     visNetwork_2.1.2      
##  [97] pROC_1.18.0            shape_1.4.6            parallelly_1.32.1     
## [100] spatstat.random_3.0-1  jpeg_0.1-9             S4Vectors_0.36.0      
## [103] scales_1.2.1           magrittr_2.0.3         plyr_1.8.7            
## [106] ica_1.0-3              compiler_4.2.2         RColorBrewer_1.1-3    
## [109] clue_0.3-63            fitdistrplus_1.1-8     cli_3.4.0             
## [112] listenv_0.8.0          pbapply_1.6-0          htmlTable_2.4.1       
## [115] Formula_1.2-4          MASS_7.3-58.1          tidyselect_1.2.0      
## [118] stringi_1.7.8          highr_0.9              yaml_2.3.6            
## [121] latticeExtra_0.6-30    ggrepel_0.9.2          grid_4.2.2            
## [124] sass_0.4.4             tools_4.2.2            future.apply_1.10.0   
## [127] parallel_4.2.2         circlize_0.4.15        rstudioapi_0.14       
## [130] foreach_1.5.2          foreign_0.8-83         gridExtra_2.3         
## [133] prodlim_2019.11.13     rmdformats_1.0.4       farver_2.1.1          
## [136] Rtsne_0.16             digest_0.6.29          BiocManager_1.30.19   
## [139] rgeos_0.5-9            FNN_1.1.3.1            shiny_1.7.3           
## [142] lava_1.7.0             Rcpp_1.0.9             broom_1.0.1           
## [145] later_1.3.0            harmony_0.1.1          RcppAnnoy_0.0.20      
## [148] httr_1.4.4             ComplexHeatmap_2.12.1  colorspace_2.0-3      
## [151] rvest_1.0.3            fs_1.5.2               tensor_1.5            
## [154] reticulate_1.26        IRanges_2.32.0         splines_4.2.2         
## [157] uwot_0.1.14            spatstat.utils_3.0-1   sp_1.5-1              
## [160] plotly_4.10.1          xtable_1.8-4           jsonlite_1.8.3        
## [163] timeDate_4021.106      ipred_0.9-13           R6_2.5.1              
## [166] Hmisc_4.7-2            pillar_1.8.1           htmltools_0.5.3       
## [169] mime_0.12              BiocParallel_1.32.4    glue_1.6.2            
## [172] fastmap_1.1.0          DT_0.26                BiocNeighbors_1.16.0  
## [175] class_7.3-20           codetools_0.2-18       utf8_1.2.2            
## [178] lattice_0.20-45        bslib_0.4.1            spatstat.sparse_3.0-0 
## [181] ggbeeswarm_0.6.0       leiden_0.4.3           interp_1.1-3          
## [184] survival_3.4-0         limma_3.54.0           rmarkdown_2.18        
## [187] munsell_0.5.0          e1071_1.7-12           GetoptLong_1.0.5      
## [190] iterators_1.0.14       haven_2.5.1            reshape2_1.4.4        
## [193] gtable_0.3.1